Background

Notebook to play with slope coefficients before putting them into the report.

Data setup

# Indicators to request from the API. The four vectors below are parallel:
# position i holds the API source, the API signal, the display name used in
# plot titles, and the target ("Cases" or "Deaths") for indicator i.
# TODO: Add Google Symptoms "eventually"
source_names = c(
  "doctor-visits",
  "fb-survey",
  "fb-survey",
  "hospital-admissions"
)
signal_names = c(
  "smoothed_adj_cli",
  "smoothed_cli",
  "smoothed_hh_cmnty_cli",
  "smoothed_adj_covid19"
)
pretty_names = c(
  "Doctor visits",
  "Facebook CLI",
  "Facebook CLI-in-community",
  "Hospitalizations"
)
target_names = c(
  "Cases",
  "Cases",
  "Cases",
  "Deaths"
)

# Geographic resolution passed as geo_type to every signal request.
geo_level = "county"

# Date range passed to the API requests; end_day is deliberately NULL.
end_day = NULL
start_day = "2020-04-15"

# Location of the on-disk cache of the fetched data frames.
cache_fname = "cached_data/03_heterogeneity_core_indicators.RDS"

# Load the indicator signals and the USAFacts case / death targets.
#
# Cache miss: every signal is fetched from the API and written to cache_fname.
# Cache hit:  the data frames are read back from cache_fname.
#
# geo_values (locations with at least case_num cumulative confirmed cases on
# the last day present in df_cases) is defined on BOTH paths and stored as a
# fourth element of the cache. A cache that holds only the three data frames
# is still accepted: geo_values is then rebuilt from the API and the cache
# file is rewritten with all four elements.
case_num = 500
geo_values = NULL

if (file.exists(cache_fname)) {
  cached_data = readRDS(cache_fname)
  df_signals = cached_data[[1]]
  df_cases = cached_data[[2]]
  df_deaths = cached_data[[3]]
  if (length(cached_data) >= 4) {
    geo_values = cached_data[[4]]
  }
} else {
  # seq_along() keeps the loop empty if signal_names is ever empty, where
  # 1:length(signal_names) would iterate over c(1, 0).
  df_signals = vector("list", length(signal_names))
  for (i in seq_along(signal_names)) {
    df_signals[[i]] = suppressWarnings(
                        covidcast_signal(source_names[i], signal_names[i],
                                         start_day, end_day,
                                         geo_type=geo_level))
  }

  # Fetch USAFacts confirmed case incidence proportion (smoothed with 7-day
  # trailing average)
  df_cases = suppressWarnings(
              covidcast_signal("usa-facts", "confirmed_7dav_incidence_prop",
                              start_day, end_day,
                              geo_type=geo_level))

  df_deaths = suppressWarnings(
              covidcast_signal("usa-facts", "deaths_7dav_incidence_prop",
                              start_day, end_day,
                              geo_type=geo_level))
}

if (is.null(geo_values)) {
  # Cumulative counts are requested for the most recent day in df_cases only,
  # at the same geographic level as the other signals.
  last_day = max(df_cases$time_value)
  geo_values = suppressWarnings(
                 covidcast_signal("usa-facts", "confirmed_cumulative_num",
                                  last_day, last_day,
                                  geo_type=geo_level)) %>%
    filter(value >= case_num) %>% pull(geo_value)

  # saveRDS() does not create missing directories, so make sure the cache
  # directory exists before writing.
  dir.create(dirname(cache_fname), recursive=TRUE, showWarnings=FALSE)
  saveRDS(list(df_signals, df_cases, df_deaths, geo_values), cache_fname)
}

Setup

# Plot, for every indicator and every fitting window, the fitted slope against
# time together with the per-day median and median +/- MAD of the slopes.

# Fitting windows as c(start, end) offsets; shown in plot titles as
# "t in start:end". sensorized_vals is indexed in the same order.
sensorize_time_ranges = list(
      c(-7, -1),
      c(-10, -1),
      c(-14, -1),
      c(-21, -1))
# Lower / upper slope quantiles used as the visible y-range of each plot.
QUANTS = c(0.01, 0.99)

# TODO: Add more "core indicators"

for (ind_idx in seq_along(source_names)) {
  if (target_names[ind_idx] == 'Cases') {
    df_target = df_cases
  } else if (target_names[ind_idx] == 'Deaths') {
    df_target = df_deaths
  } else {
    stop(sprintf("No matching dataframe for target %s.", target_names[ind_idx]))
  }

  base_cor_fname = sprintf('results/03_base_cors_%s_%s.RDS',
                            source_names[ind_idx], signal_names[ind_idx])
  sensorize_fname = sprintf('results/03_sensorize_cors_%s_%s.RDS',
                            source_names[ind_idx], signal_names[ind_idx])
  sensorize_val_fname = sprintf('results/03_sensorize_vals_%s_%s.RDS',
                            source_names[ind_idx], signal_names[ind_idx])

  # NOTE(review): df_target, df_cor_base and sensorize_cors are not used by
  # the plots below; they are still loaded so the notebook's global state is
  # unchanged -- confirm whether other chunks rely on them.
  df_cor_base = readRDS(base_cor_fname)
  sensorize_cors = readRDS(sensorize_fname)
  sensorized_vals = readRDS(sensorize_val_fname)

  for (inner_idx in seq_along(sensorize_time_ranges)) {
    sv = sensorized_vals[[inner_idx]]
    print(summary(sv$slope))
    slope_limits = quantile(sv$slope, QUANTS, na.rm=TRUE)
    print(slope_limits)

    plt = ggplot(
      sv,
      aes(x=time_value,
          y=slope)
    ) + geom_point(
      alpha=0.1,
      size=0.5
    ) + geom_hline(
      yintercept=0,
      colour='white'
    ) + stat_summary(
      # group=1 pools all locations so there is one summary value per day
      aes(group=1,
          colour='median'),
      fun=function(x) { median(x, na.rm=TRUE) },
      geom="line"
    ) + stat_summary(
      aes(group=1,
          colour='+/- mad'),
      fun=function(x) { median(x, na.rm=TRUE) + mad(x, na.rm=TRUE) },
      geom="line"
    ) + stat_summary(
      aes(group=1,
          colour='+/- mad'),
      fun=function(x) { median(x, na.rm=TRUE) - mad(x, na.rm=TRUE) },
      geom="line"
    ) + scale_colour_manual(
      values=c("median"="maroon",
               "+/- mad"="darkgreen")
    ) + labs(
      colour=''
    ) + ggtitle(
      sprintf("Slope distribution for %s, fitted on t in %d:%d",
              pretty_names[ind_idx],
              sensorize_time_ranges[[inner_idx]][1],
              sensorize_time_ranges[[inner_idx]][2])
    ) + coord_cartesian(
      # Zoom only: ylim() would set scale limits, which discard slopes outside
      # the range BEFORE stat_summary runs and so bias the median / MAD lines.
      ylim=c(slope_limits[[1]], slope_limits[[2]])
    )
    print(plt)
  }
}
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max.      NA's 
##  -3488425        -1         0      3085         2 574876473     11718 
##        1%       99% 
## -26.70333  27.73314
## Warning: Removed 19566 rows containing non-finite values (stat_summary).

## Warning: Removed 19566 rows containing non-finite values (stat_summary).

## Warning: Removed 19566 rows containing non-finite values (stat_summary).
## Warning: Removed 19566 rows containing missing values (geom_point).

##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max.      NA's 
##  -3693626        -1         0      3525         2 582794882      9406 
##        1%       99% 
## -21.19657  23.05324
## Warning: Removed 17198 rows containing non-finite values (stat_summary).
## Warning: Removed 17198 rows containing non-finite values (stat_summary).

## Warning: Removed 17198 rows containing non-finite values (stat_summary).
## Warning: Removed 17198 rows containing missing values (geom_point).

##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
## -3693626       -1        0      844        2 52206253     7469 
##        1%       99% 
## -16.57598  19.69155
## Warning: Removed 15161 rows containing non-finite values (stat_summary).
## Warning: Removed 15161 rows containing non-finite values (stat_summary).

## Warning: Removed 15161 rows containing non-finite values (stat_summary).
## Warning: Removed 15161 rows containing missing values (geom_point).

##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
## -3693626       -1        0     1094        2 53331161     5355 
##        1%       99% 
## -12.64440  16.78322
## Warning: Removed 12829 rows containing non-finite values (stat_summary).
## Warning: Removed 12829 rows containing non-finite values (stat_summary).

## Warning: Removed 12829 rows containing non-finite values (stat_summary).
## Warning: Removed 12829 rows containing missing values (geom_point).

##       Min.    1st Qu.     Median       Mean    3rd Qu.       Max. 
## -155949.92      -2.33       0.03      -0.61       2.72   62730.15 
##       NA's 
##       6648 
##        1%       99% 
## -48.45862  50.61435
## Warning: Removed 9554 rows containing non-finite values (stat_summary).
## Warning: Removed 9554 rows containing non-finite values (stat_summary).

## Warning: Removed 9554 rows containing non-finite values (stat_summary).
## Warning: Removed 9554 rows containing missing values (geom_point).

##       Min.    1st Qu.     Median       Mean    3rd Qu.       Max. 
## -155949.92      -2.23       0.05       0.08       2.72   61821.25 
##       NA's 
##       4526 
##        1%       99% 
## -37.70329  41.63569
## Warning: Removed 7438 rows containing non-finite values (stat_summary).
## Warning: Removed 7438 rows containing non-finite values (stat_summary).

## Warning: Removed 7438 rows containing non-finite values (stat_summary).
## Warning: Removed 7438 rows containing missing values (geom_point).

##       Min.    1st Qu.     Median       Mean    3rd Qu.       Max. 
## -155949.92      -2.09       0.10       0.32       2.80   61821.25 
##       NA's 
##       2852 
##        1%       99% 
## -31.65871  38.37726
## Warning: Removed 5734 rows containing non-finite values (stat_summary).
## Warning: Removed 5734 rows containing non-finite values (stat_summary).

## Warning: Removed 5734 rows containing non-finite values (stat_summary).
## Warning: Removed 5734 rows containing missing values (geom_point).

##       Min.    1st Qu.     Median       Mean    3rd Qu.       Max. 
## -155949.92      -1.80       0.19       0.22       2.95   21406.72 
##       NA's 
##       1357 
##        1%       99% 
## -25.26378  36.80205
## Warning: Removed 4137 rows containing non-finite values (stat_summary).
## Warning: Removed 4137 rows containing non-finite values (stat_summary).

## Warning: Removed 4137 rows containing non-finite values (stat_summary).
## Warning: Removed 4137 rows containing missing values (geom_point).

##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max.      NA's 
## -300.3856   -0.2111    0.0461    0.1520    0.4261  255.1363      1504 
##        1%       99% 
## -3.450832  4.676999
## Warning: Removed 4510 rows containing non-finite values (stat_summary).
## Warning: Removed 4510 rows containing non-finite values (stat_summary).

## Warning: Removed 4510 rows containing non-finite values (stat_summary).
## Warning: Removed 4510 rows containing missing values (geom_point).

##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max.      NA's 
## -148.2433   -0.1868    0.0644    0.1960    0.4649  255.1363      1065 
##        1%       99% 
## -3.041596  4.581914
## Warning: Removed 4043 rows containing non-finite values (stat_summary).
## Warning: Removed 4043 rows containing non-finite values (stat_summary).

## Warning: Removed 4043 rows containing non-finite values (stat_summary).
## Warning: Removed 4043 rows containing missing values (geom_point).

##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max.      NA's 
## -148.2433   -0.1475    0.0917    0.2493    0.5221  255.1363       683 
##        1%       99% 
## -2.656265  4.402796
## Warning: Removed 3609 rows containing non-finite values (stat_summary).
## Warning: Removed 3609 rows containing non-finite values (stat_summary).

## Warning: Removed 3609 rows containing non-finite values (stat_summary).
## Warning: Removed 3609 rows containing missing values (geom_point).

##       Min.    1st Qu.     Median       Mean    3rd Qu.       Max. 
## -148.24331   -0.08659    0.14417    0.34217    0.62492  108.68948 
##       NA's 
##        309 
##        1%       99% 
## -2.136397  4.141479
## Warning: Removed 3111 rows containing non-finite values (stat_summary).
## Warning: Removed 3111 rows containing non-finite values (stat_summary).

## Warning: Removed 3111 rows containing non-finite values (stat_summary).
## Warning: Removed 3111 rows containing missing values (geom_point).

##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max.      NA's 
## -111.8991   -0.0313    0.0000    0.0042    0.0391  201.5931      1298 
##        1%       99% 
## -1.147600  1.217261
## Warning: Removed 3192 rows containing non-finite values (stat_summary).
## Warning: Removed 3192 rows containing non-finite values (stat_summary).

## Warning: Removed 3192 rows containing non-finite values (stat_summary).
## Warning: Removed 3192 rows containing missing values (geom_point).

##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max.      NA's 
## -111.8991   -0.0276    0.0000    0.0012    0.0355  171.3647      1116 
##         1%        99% 
## -0.8680358  0.8966277
## Warning: Removed 2988 rows containing non-finite values (stat_summary).
## Warning: Removed 2988 rows containing non-finite values (stat_summary).

## Warning: Removed 2988 rows containing non-finite values (stat_summary).
## Warning: Removed 2988 rows containing missing values (geom_point).

##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max.      NA's 
## -111.8991   -0.0228    0.0000   -0.0009    0.0318   58.1832       884 
##         1%        99% 
## -0.6537521  0.6697761
## Warning: Removed 2720 rows containing non-finite values (stat_summary).
## Warning: Removed 2720 rows containing non-finite values (stat_summary).

## Warning: Removed 2720 rows containing non-finite values (stat_summary).
## Warning: Removed 2720 rows containing missing values (geom_point).

##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max.      NA's 
## -111.8991   -0.0169    0.0000    0.0005    0.0283   50.4421       628 
##         1%        99% 
## -0.4700265  0.4854451
## Warning: Removed 2390 rows containing non-finite values (stat_summary).
## Warning: Removed 2390 rows containing non-finite values (stat_summary).

## Warning: Removed 2390 rows containing non-finite values (stat_summary).
## Warning: Removed 2390 rows containing missing values (geom_point).

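Why is the distribution of slope coefficients centered on zero? (Shouldn’t we expect the center of the distribution to be positive?) It is possible, though unlikely, that the distribution is no longer centered on zero once we condition on location. We can check anyway: for each location, compute the median and MAD of its slopes, and report the fraction of locations whose interval median ± MAD contains zero (the “coverage”).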

# For every indicator and fitting window, report the fraction of locations
# whose per-location interval [median - MAD, median + MAD] of the fitted slope
# strictly contains zero ("coverage").

# Fitting windows as c(start, end) offsets; sensorized_vals is indexed in the
# same order.
sensorize_time_ranges = list(
      c(-7, -1),
      c(-10, -1),
      c(-14, -1),
      c(-21, -1))
QUANTS = c(0.01, 0.99)

# TODO: Add more "core indicators"

for (ind_idx in seq_along(source_names)) {
  if (target_names[ind_idx] == 'Cases') {
    df_target = df_cases
  } else if (target_names[ind_idx] == 'Deaths') {
    df_target = df_deaths
  } else {
    stop(sprintf("No matching dataframe for target %s.", target_names[ind_idx]))
  }

  base_cor_fname = sprintf('results/03_base_cors_%s_%s.RDS',
                            source_names[ind_idx], signal_names[ind_idx])
  sensorize_fname = sprintf('results/03_sensorize_cors_%s_%s.RDS',
                            source_names[ind_idx], signal_names[ind_idx])
  sensorize_val_fname = sprintf('results/03_sensorize_vals_%s_%s.RDS',
                            source_names[ind_idx], signal_names[ind_idx])

  # NOTE(review): df_target, df_cor_base and sensorize_cors are not used by
  # the coverage computation; they are still loaded so the notebook's global
  # state is unchanged -- confirm whether other chunks rely on them.
  df_cor_base = readRDS(base_cor_fname)
  sensorize_cors = readRDS(sensorize_fname)
  sensorized_vals = readRDS(sensorize_val_fname)

  for (inner_idx in seq_along(sensorize_time_ranges)) {
    sv = sensorized_vals[[inner_idx]]
    # na.rm=TRUE: a single missing slope must not turn a location's median and
    # MAD into NA, which would silently drop that location from the coverage.
    sv_medmad = sv %>% group_by(
      geo_value
    ) %>% summarize (
      med = median(slope, na.rm=TRUE),
      mad = mad(slope, na.rm=TRUE),
      med_upper = med+mad,
      med_lower = med-mad,
      .groups = "drop"
    )
    # Locations with no non-missing slope still yield NA bounds; those are
    # excluded from the mean by na.rm=TRUE below.
    coverage = sv_medmad %>% mutate (
      contains_zero = (0 < med_upper) & (0 > med_lower)
    ) %>% pull (
      contains_zero
    ) %>% mean (
      na.rm=TRUE
    )
    # Include the fitting window so the four lines per indicator can be told
    # apart in the output.
    print(sprintf('Coverage for %s, fitted on t in %d:%d: %f',
                  pretty_names[ind_idx],
                  sensorize_time_ranges[[inner_idx]][1],
                  sensorize_time_ranges[[inner_idx]][2],
                  coverage))
  }
}
## `summarise()` ungrouping output (override with `.groups` argument)
## [1] "Coverage for Doctor visits: 0.999278"
## `summarise()` ungrouping output (override with `.groups` argument)
## [1] "Coverage for Doctor visits: 1.000000"
## `summarise()` ungrouping output (override with `.groups` argument)
## [1] "Coverage for Doctor visits: 0.998764"
## `summarise()` ungrouping output (override with `.groups` argument)
## [1] "Coverage for Doctor visits: 0.984410"
## `summarise()` ungrouping output (override with `.groups` argument)
## [1] "Coverage for Facebook CLI: 1.000000"
## `summarise()` ungrouping output (override with `.groups` argument)
## [1] "Coverage for Facebook CLI: 1.000000"
## `summarise()` ungrouping output (override with `.groups` argument)
## [1] "Coverage for Facebook CLI: 0.995320"
## `summarise()` ungrouping output (override with `.groups` argument)
## [1] "Coverage for Facebook CLI: 0.963975"
## `summarise()` ungrouping output (override with `.groups` argument)
## [1] "Coverage for Facebook CLI-in-community: 1.000000"
## `summarise()` ungrouping output (override with `.groups` argument)
## [1] "Coverage for Facebook CLI-in-community: 0.996800"
## `summarise()` ungrouping output (override with `.groups` argument)
## [1] "Coverage for Facebook CLI-in-community: 0.993404"
## `summarise()` ungrouping output (override with `.groups` argument)
## [1] "Coverage for Facebook CLI-in-community: 0.925926"
## `summarise()` ungrouping output (override with `.groups` argument)
## [1] "Coverage for Hospitalizations: 0.936709"
## `summarise()` ungrouping output (override with `.groups` argument)
## [1] "Coverage for Hospitalizations: 0.926941"
## `summarise()` ungrouping output (override with `.groups` argument)
## [1] "Coverage for Hospitalizations: 0.910204"
## `summarise()` ungrouping output (override with `.groups` argument)
## [1] "Coverage for Hospitalizations: 0.870216"